main_alleles <- c("WT", "KRAS_G12A", "KRAS_G12C", "KRAS_G12D", "KRAS_G12V", "KRAS_G13D")
dusp_rna_data <- readRDS("~/Downloads/coad-dusp-rna_expression.rds") %.% {
filter(!is_hypermutant)
select(-is_hypermutant)
mutate(
allele = ifelse(ras_allele %in% !!main_alleles, ras_allele, "Other"),
allele = str_remove(allele, "KRAS_"),
allele = factor_alleles(allele)
)
}
# Put DUSP genes in order.
dusp_order <- unique(dusp_rna_data$hugo_symbol)
dusp_num <- as.numeric(str_remove_all(dusp_order, "[:alpha:]"))
dusp_order <- dusp_order[order(dusp_num)]
dusp_rna_data$hugo_symbol <- factor(dusp_rna_data$hugo_symbol, levels = dusp_order)
A sample of the RNA expression data table.
dusp_rna_data %>%
rename(
`DUSP` = hugo_symbol,
`tumor sample` = tumor_sample_barcode,
`RNA (RSEM)` = rna_expr
) %>%
select(-ras_allele) %>%
head() %>%
kbl() %>%
kable_styling(bootstrap_options = c("striped", "hover"))
| DUSP | tumor sample | RNA (RSEM) | allele |
|---|---|---|---|
| DUSP10 | TCGA-3L-AA1B | 254.352 | WT |
| DUSP10 | TCGA-4N-A93T | 133.527 | G12D |
| DUSP10 | TCGA-4T-AA8H | 275.635 | G12V |
| DUSP10 | TCGA-5M-AAT4 | 507.745 | G12D |
| DUSP10 | TCGA-5M-AAT5 | 572.762 | G12D |
| DUSP10 | TCGA-5M-AATA | 209.145 | WT |
The number of tumor samples with missing data for each DUSP gene.
dusp_rna_data %>%
filter(is.na(rna_expr)) %>%
count(hugo_symbol, sort = TRUE) %>%
rename(DUSP = hugo_symbol, num = n) %>%
kbl() %>%
kable_paper(bootstrap_options = "striped", full_width = FALSE, position = "left")
| DUSP | num |
|---|---|
| DUSP13 | 149 |
| DUSP21 | 149 |
The number of tumor samples with 0 RNA expression values for each DUSP gene.
dusp_rna_data %>%
filter(rna_expr <= 0) %>%
count(hugo_symbol) %>%
rename(DUSP = hugo_symbol, num = n) %>%
kbl() %>%
kable_paper(bootstrap_options = "striped", full_width = FALSE, position = "left")
| DUSP | num |
|---|---|
| DUSP5P | 73 |
| DUSP9 | 34 |
| DUSP13 | 217 |
| DUSP15 | 2 |
| DUSP21 | 284 |
| DUSP26 | 58 |
| DUSP27 | 11 |
All negative RNA expressionvalues were set to 0.
dusp_rna_data %<>% mutate(rna_expr = map_dbl(rna_expr, ~ max(0, .x)))
dusp_rna_data %>%
filter(!is.na(rna_expr)) %>%
mutate(rna_expr = rna_expr + 1) %>%
ggplot(aes(x = allele, y = rna_expr)) +
facet_wrap(~hugo_symbol, scales = "free_y") +
geom_boxplot(outlier.shape = NA) +
scale_y_continuous(trans = "log10") +
theme(
panel.grid.major.x = element_blank(),
axis.text = element_text(size = 8),
axis.text.x = element_text(angle = 45, hjust = 1)
) +
labs(x = NULL, y = "RNA expression (log10 + 1)")
DUSP13 and DUSP21 were removed from the analysis because they were missing data and had very low expression levels.
IGNORE_DUSPS <- c("DUSP13", "DUSP21")
dusp_rna_data %<>% filter(!hugo_symbol %in% IGNORE_DUSPS)
plot_dusp_distribtions <- function(df, x) {
df %>%
ggplot(aes(x = {{ x }})) +
facet_wrap(~hugo_symbol, scales = "free") +
scale_y_continuous(expand = expansion(c(0, 0.02))) +
geom_density() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
axis.text = element_text(size = 7),
axis.text.x = element_text(angle = 30, hjust = 1),
strip.text = element_text(size = 7),
panel.spacing = unit(1, "mm")
) +
labs(x = "RNA expression")
}
plot_dusp_distribtions(dusp_rna_data, rna_expr) +
ggtitle("Non-transformed RNA expression values")
dusp_rna_data %>%
mutate(rna_expr = log10(rna_expr + 1)) %>%
plot_dusp_distribtions(rna_expr) +
ggtitle("log10(RNA + 1) transformed data")
dusp_rna_data %>%
mutate(rna_expr = sqrt(rna_expr)) %>%
plot_dusp_distribtions(rna_expr) +
ggtitle("sqrt(RNA) transformed data")
dusp_rna_data %>%
group_by(hugo_symbol) %>%
mutate(rna_expr = scale_numeric(rna_expr)) %>%
ungroup() %>%
plot_dusp_distribtions(rna_expr)+
ggtitle("Centralized and scaled (z-scale) data")
dusp_rna_data %>%
mutate(rna_expr = sqrt(rna_expr)) %>%
group_by(hugo_symbol) %>%
mutate(rna_expr = scale_numeric(rna_expr)) %>%
ungroup() %>%
plot_dusp_distribtions(rna_expr)+
ggtitle("sqrt(RNA) & z-scaled data")
The \(\log_10(\text{RNA} + 1)\), centralized, and scaled values will be used for the analysis.
dusp_rna_data %<>%
mutate(log10_rna_expr = log10(rna_expr + 1)) %>%
group_by(hugo_symbol) %>%
mutate(log10_z_rna = scale_numeric(log10_rna_expr)) %>%
ungroup()
dusp_corr <- dusp_rna_data %>%
pivot_wider(id = tumor_sample_barcode, names_from = hugo_symbol, values_from = log10_z_rna) %>%
select(-tumor_sample_barcode) %>%
corrr::correlate()
#>
#> Correlation method: 'pearson'
#> Missing treated using: 'pairwise.complete.obs'
dusp_corr_pheat <- dusp_corr %>%
as.data.frame() %>%
column_to_rownames("rowname")
colnames(dusp_corr_pheat) <- str_remove(colnames(dusp_corr_pheat), "DUSP")
rownames(dusp_corr_pheat) <- str_remove(rownames(dusp_corr_pheat), "DUSP")
pheatmap::pheatmap(
dusp_corr_pheat,
border_color = NA,
na_col = "white",
main = "Correlation of DUSP gene expression",
)
corrr::network_plot(dusp_corr, min_cor = 0.3) +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
labs(title = "DUSP gene expression correlation network")
new_allele_order <- as.character(sort(unique(dusp_rna_data$allele)))
new_allele_order <- c("WT", new_allele_order[new_allele_order != "WT"])
data <- dusp_rna_data %>%
mutate(
grp_allele = case_when(
allele == "G13D" ~ "G13D",
allele == "WT" ~ "WT",
TRUE ~ "KRAS"
),
grp_allele = factor(grp_allele, levels = c("WT", "G13D", "KRAS")),
allele = factor(as.character(allele), levels = new_allele_order)
)
# FOR TESTING
set.seed(0)
TESTING_DUSPS <- paste0("DUSP", 1:6)
TESTING_TSBS <- sample(unique(data$tumor_sample_barcode), 50)
data <- data %.%{
filter(hugo_symbol %in% TESTING_DUSPS)
filter(tumor_sample_barcode %in% TESTING_TSBS)
}
stash("m1", depends_on = "data", {
m1 <- stan_glmer(log10_z_rna ~ (1 + allele|hugo_symbol), data = data)
})
#> Loading stashed object.
mcmc_trace(m1, pars = "(Intercept)") / mcmc_trace(m1, pars = "sigma")
mcmc_trace(m1, regex_pars = "DUSP1")
pp_check(m1, plotfun = "stat", stat = "mean")
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pp_check(m1, plotfun = "stat_2d", stat = c("mean", "sd"))
as.data.frame(m1) %.% {
mutate(draw = row_number())
select(draw, `(Intercept)`, tidyselect::contains("DUSP"))
pivot_longer(
-c(draw, `(Intercept)`),
names_to = "parameter",
values_to = "value"
)
mutate(
parameter = str_remove_all(parameter, "[:punct:]"),
parameter = str_remove_all(parameter, "b|allele|hugosymbol|")
)
separate(parameter, c("allele", "dusp"), sep=" ")
mutate(allele = str_replace(allele, "Intercept", "WT"))
} %>%
ggplot(aes(value)) +
facet_wrap(~ dusp, scales = "free") +
geom_vline(xintercept = 0, lty = 2, color = "grey50", size = 0.9) +
geom_density(aes(color = allele), size = 1) +
scale_color_manual(values = short_allele_pal) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = expansion(c(0, 0.02))) +
labs(x = "posterior distribution", y = "probability density", color = "KRAS allele")
bayestestR::describe_posterior(m1, effects = "all")
#> Possible multicollinearity between Sigma[hugo_symbol:alleleG13D,alleleG13D] and Sigma[hugo_symbol:alleleG12V,alleleG12V] (r = 0.83), Sigma[hugo_symbol:alleleOther,alleleOther] and Sigma[hugo_symbol:alleleG13D,alleleG13D] (r = 0.78). This might lead to inappropriate results. See 'Details' in '?rope'.